{
  "cells": [
    {
      "cell_type": "markdown",
      "source": [
        "\n",
        "# Summary\n",
        "The following code trains and applies the two supervised classifiers from our paper (i.e., known vs. unknown others & negative vs. positive/neutral sentiment) using BERT. Both training and prediction are carried out with ktrain, a lightweight wrapper for the deep learning library TensorFlow Keras (and other libraries) that helps build, train, and deploy neural networks. ktrain includes support for BERT transformer models.\n",
        "\n",
        "* ktrain documentation: https://amaiya.github.io/ktrain/index.html\n",
        "* ktrain on github: https://github.com/amaiya/ktrain\n",
        "\n",
        "The following code was executed using Google Colab in order to enable GPU processing."
      ],
      "metadata": {
        "id": "IuAvSorBlWp2"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "t13_eLRkMvs6"
      },
      "outputs": [],
      "source": [
        "# Load packages\n",
        "\n",
        "# Install ktrain, the only third-party package this notebook installs.\n",
        "# `io` belongs to the Python standard library, so it is not pip-installed.\n",
        "# %pip (rather than !pip) installs into the environment of the running kernel.\n",
        "%pip install ktrain\n",
        "\n",
        "import sys\n",
        "import scipy\n",
        "import pandas as pd\n",
        "import os\n",
        "import re\n",
        "import io\n",
        "import ktrain\n",
        "from ktrain import text\n",
        "import sklearn\n",
        "from google.colab import drive"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Training: known - unknown classification"
      ],
      "metadata": {
        "id": "qEOcxg_ApMvW"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Import data\n",
        "\n",
        "# note: beforehand mount your drive using drive.mount or similarly\n",
        "\n",
        "# Training data (unbalanced version)\n",
        "train = pd.read_csv(\n",
        "    \"data/training-and-test-data/train_known_unknown.csv\",\n",
        "    index_col=0,\n",
        "    parse_dates=[0],\n",
        "    encoding='ISO-8859-1',\n",
        ")\n",
        "# Balanced version: uncomment to train on the balanced file instead\n",
        "#train = pd.read_csv(\"data/training-and-test-data/train_known_unknown_balanced.csv\",index_col=0,parse_dates=[0],encoding='ISO-8859-1')\n",
        "\n",
        "# Test data\n",
        "test = pd.read_csv(\n",
        "    \"data/training-and-test-data/test_known_unknown.csv\",\n",
        "    index_col=0,\n",
        "    parse_dates=[0],\n",
        "    encoding='ISO-8859-1',\n",
        ")"
      ],
      "metadata": {
        "id": "xIqfVP4-G_AD"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "Ykv6ulOHNKvu"
      },
      "outputs": [],
      "source": [
        "# Frequency of each manual known/unknown code in the training data\n",
        "train.manual_code_known_unknown.value_counts()"
      ]
    },
    {
      "cell_type": "code",
      "source": [
        "# Inputs are the open-ended answers (probing_answer); targets are the human codes\n",
        "# (manual_code_known_unknown). Both are turned into plain lists for training and test data.\n",
        "x_train = train[\"probing_answer\"].tolist()\n",
        "y_train = train[\"manual_code_known_unknown\"].tolist()\n",
        "x_test = test[\"probing_answer\"].tolist()\n",
        "y_test = test[\"manual_code_known_unknown\"].tolist()"
      ],
      "metadata": {
        "id": "3Pj88VsTrfPB"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Name of the Hugging Face BERT checkpoint that is passed to ktrain's Transformer wrapper\n",
        "MODEL_NAME = 'bert-base-uncased'"
      ],
      "metadata": {
        "id": "Vu51tMh-uvcO"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Pre-processing and tokenization with ktrain.\n",
        "# Tokenizing converts the text into a numerical representation that the model accepts;\n",
        "# a maximum length (maxlen=256) with truncation ensures that all inputs have the same length.\n",
        "t = text.Transformer(MODEL_NAME, maxlen=256, class_names=[0,1])\n",
        "trn = t.preprocess_train(x_train, y_train)\n",
        "# The test data is also used as validation data (val_data) for the learner below,\n",
        "# i.e. it is the data that early stopping monitors while fitting.\n",
        "val = t.preprocess_test(x_test, y_test)\n",
        "model = t.get_classifier()\n",
        "model.summary()\n",
        "\n",
        "# Learner that couples the model with the training and validation data\n",
        "learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)"
      ],
      "metadata": {
        "id": "bw5FC1emuzQf"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Fit the model\n",
        "# No number of epochs is passed; training is ended by early stopping.\n",
        "# NOTE(review): ktrain documents early_stopping as an integer patience (epochs without\n",
        "# improvement of the validation loss), so True presumably acts as a patience of 1 - confirm.\n",
        "learner.autofit(2e-5,early_stopping=True) # early stopping in order to prevent overfitting\n",
        "\n",
        "# Disabled alternatives:\n",
        "#learner.lr_find() #to find a good learning rate\n",
        "#learner.lr_plot() #to find a good learning rate\n",
        "#learner.fit_onecycle(2e-5,3)"
      ],
      "metadata": {
        "id": "DnBJNMiDuQ-2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Validate the model\n",
        "# (all calls are disabled here; precision, recall and F1 for this classifier are\n",
        "# computed on the test data in the Evaluation section further down)\n",
        "#learner.validate()\n",
        "#learner.evaluate()\n",
        "#learner.validate(class_names=t.get_classes())"
      ],
      "metadata": {
        "id": "SPtE0QA4ueQN"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Store the model: bundle the trained model with its preprocessor `t` into a predictor\n",
        "# and save it under a path relative to the current working directory\n",
        "p = ktrain.get_predictor(learner.model, t, batch_size=1032)\n",
        "p.save('trained_model_manual_code_known_unknown')"
      ],
      "metadata": {
        "id": "_KR7v7V8u6lQ"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Training: sentiment classification\n"
      ],
      "metadata": {
        "id": "fj5p6EvzvmFY"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Import data\n",
        "\n",
        "# Training data (unbalanced version)\n",
        "train = pd.read_csv(\n",
        "    \"data/training-and-test-data/train_sentiment.csv\",\n",
        "    index_col=0,\n",
        "    parse_dates=[0],\n",
        "    encoding='ISO-8859-1',\n",
        ")\n",
        "# Balanced version: uncomment to train on the balanced file instead\n",
        "#train = pd.read_csv(\"data/training-and-test-data/train_sentiment_balanced.csv\",index_col=0,parse_dates=[0],encoding='ISO-8859-1')\n",
        "\n",
        "# Test data\n",
        "test = pd.read_csv(\n",
        "    \"data/training-and-test-data/test_sentiment.csv\",\n",
        "    index_col=0,\n",
        "    parse_dates=[0],\n",
        "    encoding='ISO-8859-1',\n",
        ")\n"
      ],
      "metadata": {
        "id": "CzSkiV7bvksR"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Frequency of each manual (dichotomous) sentiment code in the training data\n",
        "train.manual_code_sentiment_dichotomous.value_counts()"
      ],
      "metadata": {
        "id": "KPwxDN5nhoc2"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Inputs are the open-ended answers (probing_answer); targets are the human codes\n",
        "# (manual_code_sentiment_dichotomous). Both are turned into plain lists for training and test data.\n",
        "x_train_sentiment = train[\"probing_answer\"].tolist()\n",
        "y_train_sentiment = train[\"manual_code_sentiment_dichotomous\"].tolist()\n",
        "x_test_sentiment = test[\"probing_answer\"].tolist()\n",
        "y_test_sentiment = test[\"manual_code_sentiment_dichotomous\"].tolist()"
      ],
      "metadata": {
        "id": "hyRPTWqa4za4"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Name of the Hugging Face BERT checkpoint that is passed to ktrain's Transformer wrapper\n",
        "MODEL_NAME = 'bert-base-uncased'\n",
        "\n",
        "# Pre-process and tokenize the sentiment data (maximum length 256, classes 0 and 1).\n",
        "# Note that t, trn, val, model and learner are re-bound here and no longer refer to the\n",
        "# known/unknown classifier trained above.\n",
        "t = text.Transformer(MODEL_NAME, maxlen=256, class_names=[0,1])\n",
        "trn = t.preprocess_train(x_train_sentiment, y_train_sentiment)\n",
        "# The test data is also used as validation data (val_data) for the learner\n",
        "val = t.preprocess_test(x_test_sentiment, y_test_sentiment)\n",
        "model = t.get_classifier()\n",
        "model.summary()\n",
        "\n",
        "learner = ktrain.get_learner(model, train_data=trn, val_data=val, batch_size=6)\n",
        "\n",
        "\n",
        "# Fit the model; no number of epochs is passed, training is ended by early stopping.\n",
        "# NOTE(review): ktrain documents early_stopping as an integer patience, so True\n",
        "# presumably acts as a patience of 1 - confirm.\n",
        "learner.autofit(2e-5,early_stopping=True)\n",
        "\n",
        "\n",
        "# Validate the model on the validation data given to the learner\n",
        "learner.validate()\n",
        "\n",
        "# Store the model: bundle it with its preprocessor `t` into a predictor and save it.\n",
        "# NOTE(review): the path is relative, so the model only ends up in Google Drive if the\n",
        "# working directory is the mounted drive - confirm.\n",
        "p = ktrain.get_predictor(learner.model, t, batch_size=1032)\n",
        "p.save('trained_model_manual_code_sentiment_dichotomous')"
      ],
      "metadata": {
        "id": "usMtviIHh0v9"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Evaluation: Accuracy, Precision, Recall, F1 Score"
      ],
      "metadata": {
        "id": "aIAMHLg2691L"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load predictors from the stored models\n",
        "# (the directory names match the ones passed to p.save() in the two training sections)\n",
        "predictor_known_unknown = ktrain.load_predictor('trained_model_manual_code_known_unknown',batch_size=2000)\n",
        "predictor_sentiment = ktrain.load_predictor('trained_model_manual_code_sentiment_dichotomous',batch_size=2000)"
      ],
      "metadata": {
        "id": "QL6ff2SaYMWK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Known-unknown\n",
        "# Predict a label for every test text with the stored classifier, one text per call;\n",
        "# the predictions are compared with the manual classification afterwards\n",
        "\n",
        "y_preds = list(map(predictor_known_unknown.predict, x_test))"
      ],
      "metadata": {
        "id": "WLAQL3cZvr7v"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
        "\n",
        "# Text report comparing the manual codes (y_test) with the predictions (y_preds)\n",
        "#print(confusion_matrix(y_test,y_preds))\n",
        "print(classification_report(y_test,y_preds))\n",
        "#print(accuracy_score(y_test, y_preds))\n",
        "\n",
        "# Same report as a DataFrame (the report dict is transposed so that each class/average is a row).\n",
        "# The cell ends with `table` so that the frame is displayed.\n",
        "table = classification_report(y_test, y_preds, output_dict=True)\n",
        "table = pd.DataFrame(table).transpose()\n",
        "table"
      ],
      "metadata": {
        "id": "P_HajSJo7Jfs"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Sentiment\n",
        "# Predict a label for every test text with the stored sentiment classifier and compare\n",
        "# the predictions with the manual classification\n",
        "\n",
        "y_preds = [predictor_sentiment.predict(x) for x in x_test_sentiment]\n",
        "\n",
        "from sklearn.metrics import classification_report, confusion_matrix, accuracy_score\n",
        "\n",
        "# Text report comparing the manual codes (y_test_sentiment) with the predictions (y_preds)\n",
        "#print(confusion_matrix(y_test,y_preds))\n",
        "print(classification_report(y_test_sentiment,y_preds))\n",
        "#print(accuracy_score(y_test, y_preds))\n",
        "\n",
        "# Same report as a DataFrame (the report dict is transposed so that each class/average is a row).\n",
        "# The cell ends with `table` so that the frame is displayed.\n",
        "table = classification_report(y_test_sentiment,y_preds, output_dict=True)\n",
        "table = pd.DataFrame(table).transpose()\n",
        "table"
      ],
      "metadata": {
        "id": "EnTQvjQ22dFj"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "markdown",
      "source": [
        "# Prediction: Use the model to predict outcome for all observations"
      ],
      "metadata": {
        "id": "5mDCCns7up3W"
      }
    },
    {
      "cell_type": "code",
      "source": [
        "# Load original data again (because we excluded some missings above)\n",
        "# does not have to include oversampled documents (we are not interested in predicting them)\n",
        "data = pd.read_csv(\n",
        "    \"data_for_bert.csv\",\n",
        "    index_col=0,\n",
        "    parse_dates=[0],\n",
        "    encoding='ISO-8859-1',\n",
        ")"
      ],
      "metadata": {
        "id": "grqUnPWVoQOA"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Keep only the rows that have an open-ended response\n",
        "data = data[data['probing_answer'].notna()]\n",
        "\n",
        "# Number of remaining rows\n",
        "data.shape[0]"
      ],
      "metadata": {
        "id": "1cTuEshY7-Jq"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Run both classifiers over every open-ended response (one text per call) and store\n",
        "# the predicted labels as new columns\n",
        "data['bert_prediction_known_unknown'] = list(map(predictor_known_unknown.predict, data['probing_answer']))\n",
        "data['bert_prediction_sentiment'] = list(map(predictor_sentiment.predict, data['probing_answer']))"
      ],
      "metadata": {
        "id": "-9hWFVI9YtaK"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Check number of observations again\n",
        "# (printed explicitly: a bare expression is only displayed when it is the last line of a cell)\n",
        "print(len(data.index))\n",
        "\n",
        "# Show column names\n",
        "for col in data.columns:\n",
        "    print(col)"
      ],
      "metadata": {
        "id": "-8rdtUW9ZpV_"
      },
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "source": [
        "# Save dataframe as CSV\n",
        "\n",
        "# Specify the path and filename where the data should be stored\n",
        "path = 'data_bert_predictions.csv'\n",
        "\n",
        "# Write the file. The path is handed to pandas directly so that pandas opens the file\n",
        "# and controls newline handling itself (a text handle opened without newline='' can\n",
        "# produce blank rows on Windows). 'utf-8-sig' writes a byte-order mark at the start.\n",
        "data.to_csv(path, encoding='utf-8-sig')"
      ],
      "metadata": {
        "id": "2HSMXlrraBiu"
      },
      "execution_count": null,
      "outputs": []
    }
  ],
  "metadata": {
    "colab": {
      "provenance": []
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "gpuClass": "standard",
    "accelerator": "GPU"
  },
  "nbformat": 4,
  "nbformat_minor": 0
}